TED Talks Vertical Search: PageRank + TF-IDF
Build a small vertical search engine over TED talk transcripts, ranking results by a blend of TF-IDF matching scores and the eigenvector centrality of each talk in the talk-recommendation graph.
import networkx as nx
import pandas as pd
import os
import json
import ast
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from IPython.display import Image
from IPython.core.display import HTML
# Show up to 1000 characters per dataframe cell when printing — transcript
# and related_talks columns hold long strings that would otherwise truncate.
pd.set_option('display.max_colwidth',1000)
What are we trying to do here?
- We want to create a vertical search engine to find TED videos matching a query.
- Dataset: all video recordings of TED Talks uploaded to the official TED website until Sept 2017.
- For each talk, the dataset contains the talk title, transcript, URL, main speaker, and the related TED talks.
# --- Load the TED data ----------------------------------------------------
# Build paths with os.path.join so the code runs on any OS (the original
# concatenated hard-coded Windows "\\" separators).
path_to_data = os.path.join(os.getcwd(), "data")
ted_main_filepath = os.path.join(path_to_data, "ted_main.csv")
transcripts_filepath = os.path.join(path_to_data, "transcripts.csv")

# ted_main.csv: one row of metadata per talk; keep only the columns we use.
ted_main_df = pd.read_csv(ted_main_filepath)
ted_main_df = ted_main_df[['title', 'url', 'related_talks']]

# transcripts.csv: (url, transcript) pairs.
transcripts_df = pd.read_csv(transcripts_filepath)
print(ted_main_df.shape)
print(transcripts_df.shape)

# Merge the two dataframes on the shared "url" column into a single frame
# holding title, transcript, url and related_talks for every talk.
final_ted_df = transcripts_df.merge(ted_main_df, on="url")
print(final_ted_df.shape)
# PageRank finds the most prominent pages in a web graph: a node's rank
# depends (recursively) on the ranks of the nodes that link to it, so a
# node cited by highly-ranked nodes is itself highly ranked.
#
# For our TED vertical search we build a directed graph where each talk is
# a node with edges pointing to its recommended ("related") talks.
# Assumption: a talk recommended by highly-ranked talks should itself rank
# highly.
#
# networkx will build the graph; first we need the (source talk,
# recommended talk) edge pairs, so keep just the columns that define them.
recommendations_df = final_ted_df[["title", "related_talks"]]
print(recommendations_df)
def recommended_titles_list(reco_str):
    """Extract the titles of related talks from a `related_talks` cell.

    Parameters
    ----------
    reco_str : str
        Python-literal string holding a list of dicts, each of which has
        (at least) a 'title' key — the raw format of the
        `related_talks` column in ted_main.csv.

    Returns
    -------
    list of str
        The 'title' value of every related talk, in order.
    """
    # ast.literal_eval safely parses the literal string; the original
    # round-tripped the result through json.dumps/json.loads, which adds
    # nothing, and shadowed the loop variable with the parsed data.
    related_talks = ast.literal_eval(reco_str)
    return [talk['title'] for talk in related_talks]
# Build the edge list (source title -> recommended title) for the graph.
# Collect plain dicts and construct the DataFrame once at the end:
# DataFrame.append inside a loop was quadratic and was removed in
# pandas >= 2.0.
edge_records = []
for index, row in recommendations_df.iterrows():
    title = row['title']
    for reco_title in recommended_titles_list(row['related_talks']):
        edge_records.append({'title': title, 'related_title': reco_title})
edges_df = pd.DataFrame(edge_records, columns=['title', 'related_title'])
print(edges_df.head(5))
# There are 14802 directed edges in the graph.
print(edges_df.shape)
# Turn the edge list into a directed networkx graph: each edge points from
# a talk to one of its recommended talks.
di_reco_graph = nx.from_pandas_edgelist(
    edges_df,
    source='title',
    target='related_title',
    create_using=nx.DiGraph(),
)
# Print generic info about the directed graph. nx.info() was removed in
# networkx 3.0, so report the node/edge counts directly.
print(f"DiGraph with {di_reco_graph.number_of_nodes()} nodes and "
      f"{di_reco_graph.number_of_edges()} edges")

# PageRank is a variant of eigenvector centrality, so compute eigenvector
# centrality for each node (TED video).
eigenvector_dict = nx.eigenvector_centrality(di_reco_graph)

# Normalise the scores so they sum to 1 (each value ends up in [0, 1]).
factor = 1.0 / sum(eigenvector_dict.values())
normalised_eigenvector_dict = {k: v * factor for k, v in eigenvector_dict.items()}

# Attach the centrality score to each talk by merging on title.
eigenvectors_df = pd.DataFrame(
    normalised_eigenvector_dict.items(),
    columns=['title', 'eigenvector_value'],
)
final_ted_df = final_ted_df.merge(eigenvectors_df, on="title")
print(final_ted_df.head(1))
# TODO: Insert graphs from gephi, and modularity analysis
# Export the edge list so it can be loaded into Gephi as a spreadsheet.
edges_df.to_csv('graph_edges.csv')
# Detour: network analytics in Gephi, a WYSIWYG graph tool (download from
# gephi.org). From the exported spreadsheet we can build the directed graph
# and analyse eigenvector centrality, degree, pagerank, and modularity
# (community structure).
# Build the path portably; keep a trailing separator (os.sep) because later
# cells append file names by plain string concatenation.
image_folder_path = os.path.join(os.getcwd(), "img") + os.sep

# Picture of the full directed graph.
Image(filename=image_folder_path + "overall_graph.png", width=250, height=250)
# Biggest modularity class (subgroup) and its associated data.
module_img = Image(
    filename=image_folder_path + "technology&innovation_module.png",
    width=250,
    height=250,
)
module_data_img = Image(
    filename=image_folder_path + "technology&innovation_module_data.png",
    width=800,
    height=800,
)
display(module_img, module_data_img)
# Second biggest modularity class (subgroup) and its associated data.
module_img = Image(
    filename=image_folder_path + "art_design_arch.png",
    width=250,
    height=250,
)
module_data_img = Image(
    filename=image_folder_path + "art_design_arch_data.png",
    width=800,
    height=800,
)
display(module_img, module_data_img)
# We have the transcript of every talk, so build a TF-IDF matrix of
# transcript terms using sklearn's vectorizer; English stop words dropped.
tfidf_vector = TfidfVectorizer(stop_words='english')
tfidf_values = tfidf_vector.fit_transform(final_ted_df['transcript'])
tfidf_matrix = tfidf_values.toarray()
# ~58795 features (columns) found for 2467 ted videos (rows).
print(tfidf_matrix.shape)

# Show a sample of 50 of the identified features (terms).
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the replacement.
print(tfidf_vector.get_feature_names_out()[5000:5050])
# Some plain numbers slip in as features; removing them via preprocessing
# is left for the next version.
# Eigenvector scores and TF-IDF are ready. For a search query we compute a
# matching score per TED video (summed over the query words) and show the
# top 5 videos by that score.
# Other queries tried: "schools", "technology and robots".
search_query = "inspiration and courage"
# Compute a matching score for every TED video against the search query.
search_tokens = search_query.split(" ")

# Map each query token to its TF-IDF column index. The fitted vectorizer's
# vocabulary_ dict gives O(1) lookups (the original scanned a feature-name
# list with .index(), and get_feature_names() was removed in sklearn 1.2).
vocabulary = tfidf_vector.vocabulary_
token_indexes = [vocabulary[token] for token in search_tokens
                 if token in vocabulary]

if not token_indexes:
    # No search term matched any feature: nothing to rank.
    print("No results")
else:
    print(token_indexes)
    # Sum the TF-IDF columns of the matched tokens across all talks.
    # Size the score vector from the matrix itself instead of the
    # hard-coded 2467 row count the original used.
    matching_scores = np.zeros(tfidf_matrix.shape[0])
    for index in token_indexes:
        matching_scores = np.add(matching_scores, tfidf_matrix[:, index])
    print(matching_scores)
    print(matching_scores.shape)
# Build the results frame — title, url, eigenvector score, query matching
# score — and rank by an equal-weight blend of the last two.
# (The original line began with a stray "</div>" left over from an HTML
# export, which is a Python syntax error; it is removed here.)
# .copy() makes the slice an independent frame so the later column
# assignments don't trigger SettingWithCopy warnings.
search_dataframe = final_ted_df[['title', 'url', 'eigenvector_value']].copy()
search_dataframe = search_dataframe.assign(matching_scores=matching_scores)
search_dataframe['total_score'] = (
    0.5 * search_dataframe['matching_scores']
    + 0.5 * search_dataframe['eigenvector_value']
)
search_dataframe.sort_values(['total_score'], ascending=[False], inplace=True)

# Show the top 5 search results.
print(search_dataframe.head(5)[['title', 'url']])